function [synidx,dbdata,synidx2ord,typetable] = dbcollate(dir_tmp,synid_offset,col_type)
%DBCOLLATE Collate the property files of a single directory into tables.
%Function that collates data from a single directory, such as the output
%properties directory after syn_collate, into either a relational database
%format or a regression format.  The files are grouped by the leading token
%of their names (see dir_sort).  The item IDs are based on the ordinals
%stored in the last column of each file, so you might need to ordinalize.
%Three versions of the data are written: raw, quantile normalized and
%median normalized.
%Syntax:    [synidx,dbdata,synidx2ord,typetable] = dbcollate(dir_tmp,synid_offset,col_type);
%Input:     dir_tmp = the directory of interest; a directory dialog
%               (uigetdir2) is opened when omitted or empty.
%           synid_offset = use offset if you want the synid to start other
%               than 1. default = 0
%           col_type = 'relational' or 'regression', join the data either for a
%               relational database or for regression testing. default =
%               'regression'
%Output:    synidx = the last synapse ID assigned, i.e. synid_offset plus
%               the number of synapses collated
%           dbdata = the formatted (raw) data as a dataset
%           synidx2ord = a table of synapse ID to ordinal, one row per synapse
%           typetable = cell table of the file names, typetable{file,group}
%Files:     CSV files are saved (sav2csv) under dir_tmp/dbdata for the
%               relational case and dir_tmp/regressionData otherwise.
%Notes:     The first file of every group is used to prime the ordinal to
%               synapse ID index, so it must contain all ordinals.
%           The directory must contain dummy files for unused channels and
%               the file sorting must be consistent between groups, because
%               the position of a file in its group is used as its 'type'
%               (relational) or as its column (regression).
if nargin==0||isempty(dir_tmp)
    dir_tmp = uigetdir2('','Directory where the files are located');    %get the directory
end
if nargin<2||isempty(synid_offset)
    synid_offset = 0;
end
if nargin<3||isempty(col_type)
    col_type = 'regression';
end
%Now sort through the directory and group the files
[filenames,names] = dir_sort(dir_tmp);
%Now open each file append and save
synidx = synid_offset;
entryidx = 1;
%initialize everything that is read back later, so that an empty first file
%or an empty directory does not lead to an undefined variable
dbdata = [];
qdbdata = [];
mdbdata = [];
synidx2ord = zeros(0,2);
typetable = {};
%row l of col_label_tmp holds the labels of property l, one column per file
col_label_tmp = {'SynapseID','MouseID','Layer';[],[],[];[],[],[];[],[],[]};
for j = 1:size(filenames,2) %go through the file sets
    filename_tmp = filenames{j};
    ord2synidx = [];  %initialize/reset
    msnum = names{j};  %mouse number followed by a one character layer
    for i = 1:size(filename_tmp,1)      %now append the files
        typetable{i,j} = filename_tmp(i,:);
        %the names come from a char matrix and are padded with blanks to a
        %common length, remove the padding before using the name as a path
        fname = deblank(filename_tmp(i,:));
        if strcmp('regression',col_type)&&j==1     %generate the column labels for regression, once
            str_tmp = {[num2str(i),'_aveIntensity'];[num2str(i),'_volume'];[num2str(i),'_totalIntensity'];[num2str(i),'_distance']};
            col_label_tmp = horzcat(col_label_tmp,str_tmp);
        end
        try     %file could be empty
            data_tmp = single(dlmread([dir_tmp,filesep,fname],',',1,0));
            ordinals = data_tmp(:,end);     %pull ordinals for ease of use
            %normalize
            qdata_tmp = quantilenormloco(data_tmp);   %quantile normalization
            mdata_tmp = manorm(data_tmp,'Method','median','LogData',1);   %median normalize
            depth = size(data_tmp,2)-1;     %how many properties are there? (all columns but the ordinal)
            mouse_id = str2num(msnum(1,1:end-1));
            layer_id = str2num(msnum(1,end));
            for k = 1:size(data_tmp,1)
                if i==1     %first time through prime the ordinal index
                    synidx = synidx+1;
                    ord2synidx(ordinals(k)) = synidx;     %note, the first file is very important in this case, it must be one that contains all ordinals.
                    synidx2ord(synidx-synid_offset,:) = [synidx ordinals(k)];    %make an index to ordinal table
                end
                if ~isnan(data_tmp(k,1))
                    switch col_type
                        case 'relational'
                            %[synapseID,mouseID,layer,aveintensity,vol,totalint,dist,type]
                            id = [ord2synidx(ordinals(k)),mouse_id,layer_id];
                            dbdata(entryidx,:) = [id,data_tmp(k,1:end-1),i];
                            qdbdata(entryidx,:) = [id,qdata_tmp(k,1:end-1),i];
                            mdbdata(entryidx,:) = [id,mdata_tmp(k,1:end-1),i];
                            entryidx = entryidx+1;
                        otherwise   %regression
                            %one row per synapse, column i+3 belongs to
                            %file i, one slice per property
                            row = ord2synidx(ordinals(k))-synid_offset;
                            id = repmat([ord2synidx(ordinals(k)),mouse_id,layer_id],[1,1,depth]);
                            dbdata(row,1:3,:) = id;     %put in place the id
                            dbdata(row,i+3,:) = reshape(data_tmp(k,1:end-1),1,1,depth);
                            qdbdata(row,1:3,:) = id;
                            qdbdata(row,i+3,:) = reshape(qdata_tmp(k,1:end-1),1,1,depth);   %quantile normalized values
                            mdbdata(row,1:3,:) = id;
                            mdbdata(row,i+3,:) = reshape(mdata_tmp(k,1:end-1),1,1,depth);   %median normalized values
                    end
                else    %need to pad for regression case
                    if strcmp('regression',col_type)    %only need to pad for the regression case
                        %pad the row of the synapse this ordinal belongs
                        %to, not the last synapse that was indexed
                        row = ord2synidx(ordinals(k))-synid_offset;
                        blank = zeros(1,1,depth);
                        dbdata(row,i+3,:) = blank;
                        qdbdata(row,i+3,:) = blank;
                        mdbdata(row,i+3,:) = blank;
                    end
                end
            end
        catch
            warning([fname,' is empty!']);
            if strcmp('regression',col_type)    %only need to pad for the regression case
                if ~isempty(dbdata)     %not the first time through
                    %make sure the column of this file exists, new elements
                    %are zero filled.  Values that are already stored are
                    %never overwritten.
                    if size(dbdata,2)<i+3
                        dbdata(1,i+3,:) = 0;
                    end
                    if size(qdbdata,2)<i+3
                        qdbdata(1,i+3,:) = 0;
                    end
                    if size(mdbdata,2)<i+3
                        mdbdata(1,i+3,:) = 0;
                    end
                else    %first time through
                    id = [1,str2num(msnum(1,1:end-1)),str2num(msnum(1,end))];     %numeric identification row
                    dbdata = id;     %put in place the id
                    qdbdata = id;
                    mdbdata = id;
                    dbdata(1,4,:) = 0;
                    qdbdata(1,4,:) = 0;
                    mdbdata(1,4,:) = 0;
                end
            end
        end
    end
end

%output data
switch col_type
    case 'relational'
        out_dir = [dir_tmp,filesep,'dbdata'];
        mkdir(dir_tmp,'dbdata');
        col_label = {'SynapseID','MouseID','Layer','aveIntensity','Volume','totalIntensity','distance','type'};
        dbdata = dataset({dbdata,col_label{:}});
        sav2csv(dbdata,['dbdata',datestr(now,30),'.csv'],out_dir);     %save data
        qdbdata = dataset({qdbdata,col_label{:}});
        sav2csv(qdbdata,['qdbdata',datestr(now,30),'.csv'],out_dir);     %save data
        mdbdata = dataset({mdbdata,col_label{:}});
        sav2csv(mdbdata,['mdbdata',datestr(now,30),'.csv'],out_dir);     %save data
    otherwise
        out_dir = [dir_tmp,filesep,'regressionData'];
        %first reformat the data: lay the property slices side by side,
        %the id columns are only taken from the first slice
        db_tmp = dbdata(:,:,1);  %initialize with the first data slice
        col_label = col_label_tmp(1,:);
        qdb_tmp = qdbdata(:,:,1);
        mdb_tmp = mdbdata(:,:,1);
        for l = 2:size(dbdata,3)
            db_tmp = horzcat(db_tmp,dbdata(:,4:end,l));
            qdb_tmp = horzcat(qdb_tmp,qdbdata(:,4:end,l));
            mdb_tmp = horzcat(mdb_tmp,mdbdata(:,4:end,l));
            col_label = horzcat(col_label,col_label_tmp(l,4:end));
        end
        mkdir(dir_tmp,'regressionData');
        dbdata = dataset({db_tmp,col_label{:}});
        sav2csv(dbdata,['rdata',datestr(now,30),'.csv'],out_dir);     %save data
        qdbdata = dataset({qdb_tmp,col_label{:}});
        sav2csv(qdbdata,['qrdata',datestr(now,30),'.csv'],out_dir);     %save data
        mdbdata = dataset({mdb_tmp,col_label{:}});
        sav2csv(mdbdata,['mrdata',datestr(now,30),'.csv'],out_dir);     %save data
end
col_name = {'synidx','ordinal'};
synidxdata = dataset({synidx2ord,col_name{:}});
sav2csv(synidxdata,['synidxdata',datestr(now,30),'.csv'],out_dir);   %save synapse ordinal list
sav2csv(typetable,['typedata',datestr(now,30),'.csv'],out_dir);     %save file order

%-----------------------------------------------------------------------------------------------------------------------------
function [filenames_out,uq_names] = dir_sort(dir_tmp)
%DIR_SORT Group the files of a directory by the leading token of their names.
%The leading token of a file name is everything in front of its first
%punctuation or whitespace character, where '&' is not counted as
%punctuation.  A name without any such character is used as a whole.
%Input:     dir_tmp = the directory to list (sub directories are ignored)
%Output:    filenames_out = cell array with one char matrix per group, one
%               file name per row (padded with blanks by char); empty when
%               the directory holds no files
%           uq_names = cell array of the unique leading tokens, in the
%               order returned by unique; filenames_out{j} belongs to
%               uq_names{j}
%NOTE(review): unique is case sensitive while the matching below is not, so
%tokens that differ only by case form separate groups that each contain the
%files of both.  Kept as is, confirm whether that can occur in practice.
dir_struct = dir(dir_tmp);  %grab the directory information
idx = [dir_struct.isdir];   %grab all of the isdir flags
names = {dir_struct.name};   %grab all of the names in the root
filenames = names(~idx);    %files only
n_files = size(filenames,2);
f_tmp = cell(n_files,1);    %leading token of each file name
filenames_out = {};     %stays empty when there are no files
for i = 1:n_files      %step through each filename and pull the wanted word
    filename_tmp = filenames{i};
    %punctuation (with & exempted) and whitespace end the leading token
    is_delim = (isstrprop(filename_tmp,'punct')&filename_tmp~='&')|isstrprop(filename_tmp,'wspace');
    first_delim = find(is_delim,1);
    if isempty(first_delim)     %nothing to split on, use the whole name
        f_tmp{i} = filename_tmp;
    else    %all we need is the part in front
        f_tmp{i} = filename_tmp(1,1:first_delim-1);
    end
end
uq_names = unique(f_tmp);    %how many unique words are there
for j = 1:size(uq_names,1)      %step through unique names
    is_member = strcmpi(uq_names{j},f_tmp);     %match (case insensitive)
    filenames_out{j} = char(filenames(is_member));  %one file name per row
end
%-----------------------------------------------------------------------------------------------------------------------------
function [norm_data] = quantilenormloco(data)
%QUANTILENORMLOCO Wrapper that keeps degenerate columns away from quantilenorm.
%A column is treated as degenerate when it holds at most one non-NaN value.
%Such columns are returned untouched; the remaining columns are passed
%together to quantilenorm(...,'Median',1) and written back in place.
%Input:     data = numeric matrix, may contain NaN
%Output:    norm_data = matrix of the same size as data
valid_count = size(data,1)-sum(isnan(data),1);     %non-NaN entries per column
degenerate = valid_count<=1;    %empty or singleton columns
if ~any(degenerate)     %nothing to protect, normalize everything
    norm_data = quantilenorm(data,'Median',1);
    return
end
norm_data = data;   %start from the original so degenerate columns pass through
usable = data(:,~degenerate);   %the columns that can be normalized
if ~isempty(usable)     %if all is gone, don't do it
    norm_data(:,~degenerate) = quantilenorm(usable,'Median',1);
end